# Notebook-style setup: library versions, display configuration, and data load.
# NOTE: the `!` and `%` lines below are IPython magics — this file only runs
# inside a Jupyter/IPython session, not as a plain Python script.
import catboost
print(catboost.__version__)
!python --version
%matplotlib inline
# Echo every top-level expression in a cell, not just the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
import pandas as pd
import os
import numpy as np
np.set_printoptions(precision=4)
import catboost
from catboost import *
from catboost import datasets
from sklearn.model_selection import train_test_split
from pathlib import Path
# Path to the training data, relative to the notebook's working directory.
DIR=Path('../listing_price_suggest.csv')
DIR
df=pd.read_csv(DIR)
df.head()
# Train/test split: features are every column from index 2 on, target is 'y_var'.
features = df.iloc[:, 2:]
target = df['y_var']
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.33, random_state=5
)
# Keep the four splits together for convenient passing around.
data_dict = dict(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
# Columns whose name contains "cat" are treated as categorical; the rest as continuous.
is_categorical = df.columns.str.contains("cat")
cat_columns = df.columns[is_categorical].tolist()
con_columns = df.columns[~is_categorical].tolist()
cat_columns, con_columns
from catboost import CatBoostRegressor
# Quick baseline: a very short run to confirm the pipeline works end to end.
model = CatBoostRegressor(iterations=15, learning_rate=0.1)
model.fit(
    X_train,
    y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    verbose=True,
)
print('Model is fitted: ' + str(model.is_fitted()))
print('Model params:')
print(model.get_params())
# A longer run with a higher learning rate; plot=True renders the
# interactive training chart in the notebook.
model = CatBoostRegressor(iterations=50, learning_rate=0.5, random_seed=63)
model.fit(
    X_train,
    y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True,
)
# Compare two learning rates side by side. Each model logs to its own
# train_dir so MetricVisualizer can overlay their learning curves.
# FIX: directory names were misspelled 'learing_rate_*'; corrected here and in
# the MetricVisualizer call below so the two stay in sync.
model1 = CatBoostRegressor(
    learning_rate=0.7,
    iterations=100,
    random_seed=0,
    train_dir='learning_rate_0.7',
)
model2 = CatBoostRegressor(
    learning_rate=0.01,
    iterations=100,
    random_seed=0,
    train_dir='learning_rate_0.01',
)
model1.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_columns,
    verbose=False,
)
model2.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    cat_features=cat_columns,
    verbose=False,
)
from catboost import MetricVisualizer
# Overlay both runs' metric curves (reads the train_dir logs written above).
MetricVisualizer(['learning_rate_0.01', 'learning_rate_0.7']).start()
# Long run (1000 iterations) against the held-out eval set.
# NOTE(review): use_best_model was left at its default here — presumably the
# intent is to keep the best-scoring iteration; confirm if that matters.
model = CatBoostRegressor(iterations=1000, learning_rate=0.5, random_seed=63)
model.fit(
    X_train,
    y_train,
    cat_features=cat_columns,
    eval_set=(X_test, y_test),
    verbose=False,
    plot=True,
)
from catboost import cv
# 5-fold cross-validation on the full dataset: MAE is the optimized loss,
# RMSE is tracked as an additional metric.
params = {
    'loss_function': 'MAE',
    'iterations': 100,
    'custom_loss': 'RMSE',
    'random_seed': 63,
    'learning_rate': 0.5,
}
full_pool = Pool(df.iloc[:, 2:], label=df['y_var'], cat_features=cat_columns)
cv_data = cv(
    params=params,
    pool=full_pool,
    fold_count=5,
    shuffle=True,
    partition_random_seed=0,
    plot=True,
    stratified=False,
    verbose=False,
)
cv_data
# Randomized hyperparameter search over learning rate, tree depth, and L2 regularization.
grid = {
    'learning_rate': [0.03, 0.1],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
}
model_search = CatBoostRegressor(iterations=100, learning_rate=0.5, random_seed=63)
randomized_search_result = model_search.randomized_search(
    grid,
    X=df.iloc[:, 2:],
    y=df['y_var'],
    plot=True,
)
# Feature importances of the last trained model, as a sorted DataFrame.
model.get_feature_importance(prettified=True)
# FIX: pool1 was referenced here before it was defined (its definition only
# appeared further down the notebook), raising a NameError — build it first.
pool1 = Pool(data=df.iloc[:, 2:], label=df['y_var'], cat_features=cat_columns)
# Visualize the first tree of the ensemble.
d = model.plot_tree(
    tree_idx=0,
    pool=pool1,
)
d
import shap  # FIX: shap was used below but never imported

pool1 = Pool(data=df.iloc[:, 2:], label=df['y_var'], cat_features=cat_columns)
# FIX: type='Interaction' returns pairwise interaction values with a different
# shape; the indexing below (expected value in the last column, one row per
# sample) matches per-sample SHAP values, which require type='ShapValues'.
shap_values = model.get_feature_importance(pool1, type='ShapValues')
expected_value = shap_values[0, -1]
shap_values = shap_values[:, :-1]
print(shap_values.shape)
shap_values
shap.initjs()
# Force plot for a single observation (row 3).
# FIX: pass only the feature columns (df.iloc[:, 2:] is what the model was
# trained on) so the feature row aligns with the SHAP value row.
shap.force_plot(expected_value, shap_values[3, :], df.iloc[3, 2:])